In [52]:
import numpy as np
import numpy.random as npr
import matplotlib.pyplot as plt
plt.rcParams['font.family']='Serif'
%matplotlib inline
import seaborn as sns
In [34]:
from sklearn import neighbors
from sklearn.neighbors import KernelDensity
kde = KernelDensity(0.2)
In [79]:
npr.seed(0)
kde.fit(npr.rand(100,100)*10)
Out[79]:
In [80]:
X = kde.sample(10000)
density = np.exp(kde.score_samples(X))
In [93]:
X_uniform = npr.rand(1000,2)
density_uniform = np.ones(len(X_uniform))
In [81]:
plt.scatter(X[:,0],X[:,1],c=density,cmap='Blues')
Out[81]:
In [94]:
plt.scatter(X_uniform[:,0],X_uniform[:,1],c=density_uniform,cmap='Blues')
Out[94]:
In [95]:
def local_density_k(X,k=10,metric=None):
if metric != None:
bt = neighbors.BallTree(X,200,metric=metric)
neighbor_graph = neighbors.kneighbors_graph(X,k,'distance')
else:
neighbor_graph = neighbors.kneighbors_graph(X,k,'distance')
distances = np.array(neighbor_graph.mean(1))[:,0]
return 1-((distances - distances.min())/(distances.max() - distances.min()))
def local_density_r(X,r=0.1,metric=None):
if metric != None:
bt = neighbors.BallTree(X,200,metric=metric)
neighbor_graph = neighbors.radius_neighbors_graph(bt,r)
else:
neighbor_graph = neighbors.radius_neighbors_graph(X,r)
counts = np.array(neighbor_graph.sum(1))[:,0]
return ((counts - counts.min())/(counts.max() - counts.min()))
In [101]:
d_k = local_density_k(X_uniform)
d_r = local_density_r(X_uniform,r=0.1)
In [111]:
plt.scatter(d_k,np.log(density_uniform)+npr.randn(len(d_k))*0.001)
Out[111]:
In [103]:
plt.scatter(d_r,np.log(density))
Out[103]:
Kraskov: http://journals.aps.org/pre/pdf/10.1103/PhysRevE.69.066138
Mutual information is defined as: $$ I(X,Y) = \int \int dx dy \mu(x,y) \log \frac{\mu(x,y)}{\mu_x(x) \mu_y(y)} $$
Binned estimator: $$I(X,Y) \approx I_{binned}(X,Y) \equiv \sum_{i,j} p(i,j) \log \frac{p(i,j)}{p_x(i) p_y(j)}$$
In [ ]:
In [166]:
npr.seed(0)
X = npr.rand(1000)
def f(x,noise_size=0.1):
return (x-0.5)**2 + npr.rand()*noise_size
Y = np.array([f(x,0.001) for x in X])
In [167]:
plt.scatter(X,Y)
Out[167]:
In [186]:
def binned_MI_vec(X,Y,bins=50):
hist = np.histogram2d(X,Y,bins=bins)[0]
#hist*np.log(hist / np.max(1,hist[:]))
mi_est = 0
p_x = hist.sum(0)
p_y = hist.sum(1)
ind = np.outer(p_x,p_y)
ind[ind==0]=1
tmp = hist/ind
tmp[tmp<1]=1
n = len(X)
return np.sum((hist)*np.log(tmp))
#for i in range(len(hist)):
# for j in range(len(hist)):
# mi_est += hist[i,j] * np.log(hist[i,j]/np.max(1,(p_x[i]*p_y[j])))
#return mi_est
In [203]:
#def binned_MI_loop(X,Y,bins=50):
bins=50
n = len(X)
hist = np.histogram2d(X,Y,bins=bins)[0] / n
#hist*np.log(hist / np.max(1,hist[:]))
mi_est = 0
p_x = hist.sum(0)
p_y = hist.sum(1)
for i in range(bins):
for j in range(bins):
mi_est += hist[i,j] * np.log(np.max(1,hist[i,j]/np.max(1,(p_x[i]*p_y[j]))))
#return mi_est
mi_est
Out[203]:
In [201]:
plt.imshow(hist)
Out[201]:
In [182]:
hist = np.histogram2d(X,Y,bins=50)[0]
plt.imshow(hist)
Out[182]:
In [183]:
p_x = hist.sum(0)
p_y = hist.sum(1)
ind = np.outer(p_x,p_y)
In [184]:
ind = np.outer(p_x,p_y)
ind[ind==0]=1
ind.min()
Out[184]:
In [185]:
plt.imshow(ind)
Out[185]:
In [178]:
np.min(p_x)
Out[178]:
In [179]:
p_x = hist.sum(0)
p_x.shape,hist.shape
Out[179]:
In [195]:
binned_MI_loop(X,Y)
Out[195]:
In [211]:
from scipy.special import digamma
Eq. 4 from Kraskov $$H(X) \approx \frac{1}{N-1} \sum_{i=1}^{N-1} \log(x_{i+1}-x_i) + \psi(1) - \psi(N)$$
In [237]:
def estimate_entropy(X):
X_ = np.sort(X)
N = len(X)
H = 0
for i in range(N-1):
#H += np.log(X_[i+1]-X_[i])#+digamma(1)-digamma(N)
#H += X_[i+1]-X_[i]
H += np.log(X_[i+1]-X_[i])+digamma(1)-digamma(N)
return (H/(N-1))
In [237]:
In [238]:
X_ = np.sort(X)
In [239]:
plt.plot(X_)
Out[239]:
In [240]:
estimate_entropy(X)
Out[240]:
In [241]:
estimate_entropy(npr.rand(1000))
Out[241]:
In [246]:
estimate_entropy(npr.randn(1000))
Out[246]:
In [247]:
X_ = np.sort(X)
In [248]:
X_[1]-X_[0]
Out[248]:
In [255]:
def estimate_MI_knn(X,Y,k=10):
XY = np.vstack((X,Y)).T
bt = neighbors.BallTree(XY,k)
In [254]:
estimate_MI_knn(X,Y)
In [344]:
XY = np.vstack((X,Y)).T*10
k=100
bt = neighbors.BallTree(XY,k)
dist,ind=bt.query(XY[0],k)
dist = dist[0]
dist_to_k = dist[-1]
dist_to_k
Out[344]:
In [349]:
XY = npr.randn(1000,2)
bt = neighbors.BallTree(XY,k)
In [350]:
dists = bt.query(XY,k)[0]
dists.shape
Out[350]:
In [342]:
dists.shape
Out[342]:
In [345]:
for k in range(1,k)[::10]:
dists_to_k = dists[:,k]
sns.kdeplot(dists_to_k)#,clip=(-0.1,0.6))
#plt.xlim(-0.1,0.6)
In [ ]:
dists_to_k